{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Split by HTML section"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='Foo \\n Some intro text about Foo.', metadata={'Header 1': 'Foo'}),\n",
       " Document(page_content='Bar main section \\n Some intro text about Bar. \\n Bar subsection 1 \\n Some text about the first subtopic of Bar. \\n Bar subsection 2 \\n Some text about the second subtopic of Bar.', metadata={'Header 2': 'Bar main section'}),\n",
       " Document(page_content='Baz \\n Some text about Baz \\n \\n \\n Some concluding text about Foo', metadata={'Header 2': 'Baz'})]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain_text_splitters import HTMLSectionSplitter\n",
    "\n",
    "html_string = \"\"\"\n",
    "    <!DOCTYPE html>\n",
    "    <html>\n",
    "    <body>\n",
    "        <div>\n",
    "            <h1>Foo</h1>\n",
    "            <p>Some intro text about Foo.</p>\n",
    "            <div>\n",
    "                <h2>Bar main section</h2>\n",
    "                <p>Some intro text about Bar.</p>\n",
    "                <h3>Bar subsection 1</h3>\n",
    "                <p>Some text about the first subtopic of Bar.</p>\n",
    "                <h3>Bar subsection 2</h3>\n",
    "                <p>Some text about the second subtopic of Bar.</p>\n",
    "            </div>\n",
    "            <div>\n",
    "                <h2>Baz</h2>\n",
    "                <p>Some text about Baz</p>\n",
    "            </div>\n",
    "            <br>\n",
    "            <p>Some concluding text about Foo</p>\n",
    "        </div>\n",
    "    </body>\n",
    "    </html>\n",
    "\"\"\"\n",
    "\n",
    "headers_to_split_on = [(\"h1\", \"Header 1\"), (\"h2\", \"Header 2\")]\n",
    "\n",
    "html_splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_split_on)\n",
    "html_header_splits = html_splitter.split_text(html_string)\n",
    "html_header_splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='Foo \\n Some intro text about Foo.', metadata={'Header 1': 'Foo'}),\n",
       " Document(page_content='Bar main section \\n Some intro text about Bar.', metadata={'Header 2': 'Bar main section'}),\n",
       " Document(page_content='Bar subsection 1 \\n Some text about the first subtopic of Bar.', metadata={'Header 3': 'Bar subsection 1'}),\n",
       " Document(page_content='Bar subsection 2 \\n Some text about the second subtopic of Bar.', metadata={'Header 3': 'Bar subsection 2'}),\n",
       " Document(page_content='Baz \\n Some text about Baz \\n \\n \\n Some concluding text about Foo', metadata={'Header 2': 'Baz'})]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "\n",
    "html_string = \"\"\"\n",
    "    <!DOCTYPE html>\n",
    "    <html>\n",
    "    <body>\n",
    "        <div>\n",
    "            <h1>Foo</h1>\n",
    "            <p>Some intro text about Foo.</p>\n",
    "            <div>\n",
    "                <h2>Bar main section</h2>\n",
    "                <p>Some intro text about Bar.</p>\n",
    "                <h3>Bar subsection 1</h3>\n",
    "                <p>Some text about the first subtopic of Bar.</p>\n",
    "                <h3>Bar subsection 2</h3>\n",
    "                <p>Some text about the second subtopic of Bar.</p>\n",
    "            </div>\n",
    "            <div>\n",
    "                <h2>Baz</h2>\n",
    "                <p>Some text about Baz</p>\n",
    "            </div>\n",
    "            <br>\n",
    "            <p>Some concluding text about Foo</p>\n",
    "        </div>\n",
    "    </body>\n",
    "    </html>\n",
    "\"\"\"\n",
    "\n",
    "headers_to_split_on = [\n",
    "    (\"h1\", \"Header 1\"),\n",
    "    (\"h2\", \"Header 2\"),\n",
    "    (\"h3\", \"Header 3\"),\n",
    "    (\"h4\", \"Header 4\"),\n",
    "]\n",
    "\n",
    "html_splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_split_on)\n",
    "\n",
    "html_header_splits = html_splitter.split_text(html_string)\n",
    "\n",
    "chunk_size = 500\n",
    "chunk_overlap = 30\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=chunk_size, chunk_overlap=chunk_overlap\n",
    ")\n",
    "\n",
    "# Split\n",
    "splits = text_splitter.split_documents(html_header_splits)\n",
    "splits"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "langchain0_1",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
